{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['chatgpt-iium-confession-noisy-translation/clean-iium00.splitted.gpt.requested',\n",
       " 'chatgpt-iium-confession-noisy-translation/clean-iium01.splitted.gpt.requested',\n",
       " 'chatgpt-iium-confession-noisy-translation/clean-iium02.splitted.gpt.requested',\n",
       " 'chatgpt-iium-confession-noisy-translation/clean-iium03.splitted.gpt.requested',\n",
       " 'chatgpt-iium-confession-noisy-translation/iium-confession-gpt.json',\n",
       " 'chatgpt-iium-confession-noisy-translation/iium-confession-more.json']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "files = sorted(glob('chatgpt-iium-confession-noisy-translation/*'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"src\": {\"text\": \"Assalamualaikum dan salam sejahtera. Hai geng. J is back. Sihat semua? Family sihat? Aku doakan semua sihat-sihat belaka. Terima kasih support aku. Aku cuma nak cerita kisah aku dan kawan-kawan aku. Kerja di ladang ni dah lama di anak tirikan oleh masyarakat. Sedih juga aku. Kami duduk hutan sawit ni tak kacau orang pun. Tapi pandangan masyarakat tu, hurmmm... Entahla.\"}, \"r\": {\"english\": \"Assalamualaikum and greetings. Hey guys, J is back. Are you all well? Is your family well? I pray that everyone is in good health. Thank you for your support. I just want to share my story and my friends' story. We have been working in this plantation for a long time and have been neglected by the community. It's sad. We live in this oil palm forest and don't bother anyone. But the community's perception, well... I don't know.\", \"malay\": \"Assalamualaikum dan salam sejahtera. Hai geng. J kembali. Semua sihat? Keluarga sihat? Saya berdoa agar semua sihat-sihat belaka. Terima kasih atas sokongan anda. Saya hanya ingin berkongsi kisah saya dan kisah rakan-rakan saya. Kami telah bekerja di ladang ini untuk waktu yang lama dan telah diabaikan oleh masyarakat. Sedih juga. Kami tinggal di hutan sawit ini dan tidak mengganggu sesiapa pun. Tetapi pandangan masyarakat, hurmmm... Entahlah.\"}}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 1 chatgpt-iium-confession-noisy-translation/clean-iium00.splitted.gpt.requested"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "iium_confession = []\n",
    "for f in files:\n",
    "    with open(f) as fopen:\n",
    "        if 'gpt.requested' in f:\n",
    "            for l in fopen:\n",
    "                data = json.loads(l)\n",
    "                iium_confession.append(data)\n",
    "        else:\n",
    "            data = json.load(fopen)\n",
    "            iium_confession.extend(data)\n",
    "            \n",
    "iium_confession = [a for a in iium_confession if a is not None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████| 475485/475485 [00:00<00:00, 780054.79it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "filtered = []\n",
    "for d in tqdm(iium_confession):\n",
    "    if 'src' in d:\n",
    "        left = d['src']['text']\n",
    "        try:\n",
    "            en = d['r']['english']\n",
    "            ms = d['r']['malay']\n",
    "        except:\n",
    "            en = None\n",
    "            ms = None\n",
    "    else:\n",
    "        left = d[0]\n",
    "        success = False\n",
    "        try:\n",
    "            r = json.loads(d[1])\n",
    "            success = True\n",
    "        except:\n",
    "            pass\n",
    "        \n",
    "        if not success:\n",
    "            try:\n",
    "                r = eval(d[1])\n",
    "                success = True\n",
    "            except:\n",
    "                pass\n",
    "                    \n",
    "        if success:\n",
    "            if isinstance(r, tuple):\n",
    "                en = [r_['english'] for r_ in r]\n",
    "                ms = [r_['malay'] for r_ in r]\n",
    "                en = ' '.join(en)\n",
    "                ms = ' '.join(ms)\n",
    "            else:\n",
    "                en = r['english']\n",
    "                ms = r['malay']\n",
    "        else:\n",
    "            en = None\n",
    "            ms = None\n",
    "    \n",
    "    if en or ms:\n",
    "        filtered.append({\n",
    "            'left': left,\n",
    "            'en': en,\n",
    "            'ms': ms\n",
    "        })           "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Ada satu hari tu, abah bawa aku ikut dia pergi kerja sebab masa tu taka da sesiapa nak jaga aku kat rumah. Masa tu umur aku 6 tahun. Kat tempat kerja abah tu ada kontena kosong, abah letak aku kat situ. Dari situ aku boleh je tengok abah bekerja. Disebabkan abah kerja kat kilang jahit baju, aku masa tu ingatkan abah tukang jahit. Tapi apa yang aku nampak depan mata aku masa tu ialah Abah mencangkung sambil kutip sampah. Ya, Abah aku kerja tukang kutip sampah dekat kilang jahit. Masa tu aku umur 6 tahun, aku tak fikir apa-apa. Kat sekolah, masa kecik aku pandai sikit. Abah tanya periksa dapat nombor berapa. Aku cakap aku dapat nombor 3. Masa tu aku suka sangat makan roti pong (roti yang ada kismis kat tengah). Boleh pulak abah tanya as hadiah nak dia belikan roti pong atau basikal. Eh mestilah aku nak basikal! Haha tapi malangnya masa tu ak kecik lagi so aku tak tahu macam mana susah payahnya abah aku bekerja nak belikan aku basikal.. Ada satu hari abah belikan aku air kotak milo. Aku pun minum sampai balik rumah. Masa tu aku minum sambil main-main sampai la air milo tu tertumpah atas baju. Baju gambar Digimon warna hijau aku ingat lagi haha. Abah yang penat baru balik kerja suruh aku bersihkan baju aku yang terkena tumpahan air milo tadi. Masa tu rumah kami cuma rumah tunggal, yang mana hanya ada ruang tamu je. Tak ada bilik. Kat situ la tempat makan, kat situ la tempat tidur. Bilik air kat luar. Lepas tu aku pun pergi dapur untuk basuh baju yang aku pakai tu. Tiba- tiba abah muncul kat pintu, ''Adik..kenapa adik basuh baju ni..? Ni kerja abah.. adik naik atas, biar abah basuh baju ni,'' Disebabkan masa tu aku baru 7 tahun, aku tak fikir apa dan terus tinggalkan baju kotor tu dan naik atas. Abah yang penat balik kerja tu yang basuh baju kotor aku. Masa umur aku 8 tahun, ajal abah dah tiba, abah pergi tinggalkan kami adik beradik. Sebab apa, aku tak tahu. Aku ingat lagi masa tu petang, dalam pukul 6.30 macam tu. Abah baru balik dari kedai dan terus pengsan. Lepas tu jiran aku bawa abah naik atas rumah dan abah dalam keadaan separa sedar. Aku nampak keadaan abah masa tu, muka abah pucat. Abah tak kenal kawan2 sekeliling dia. Tapi abah kenal aku dengan kakak aku masa tu. Masa tu aku tak datang dekat abah sebab masa tu aku takut tengok keadaan abah macam tu. Bila lama-lama, baru aku beranikan diri nak dekat dengan abah. Wasiat terakhir abah kat aku cuma belajar rajin-rajin dan dengar cakap kakak atau sesiapa saja yang bakal jaga aku selepas tu. Bila dah dewasa, aku kesal. Kenapa aku tak datang sejurus abah panggil aku masa tu. Kalau aku datang awal, mesti banyak yang abah nak sampaikan dekat aku. Bila abah dah tak ada, baru aku rasa kehilangan. Aku menangis hari-hari terkenangkan abah.\""
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'english': \"Since I moved schools when I was 8 years old, I haven't really liked coming to school. You can imagine what learning was like in second grade. We had to draw pictures of our families in class, make cards for Mother's Day, Father's Day, and so on. From second grade until high school, I only gave cards to my teachers. On Mother's Day, I gave a card to my female teacher, and on Father's Day, I gave a card to my male teacher. It was like that until high school when we had Civics and Citizenship as a subject, and even then, I gave a card to my teacher haha. One night, my siblings and I went to our grandfather's house. Our uncle and cousin were there too. Our uncle was the type to lecture us when he was angry, and then he would just sweep the trash. Whenever I went to my grandfather's house, he was always sweeping. Anyway, one night, he got angry at his child and said, 'Don't be like a child without parents, neglected with no future.' I felt it even though I was still young. But luckily, my siblings and I weren't the type to argue or anything. We just went to see our grandfather, not him. But I still remember things like that until now. It's normal, when you're struggling and at your lowest point, no one wants to admit they're related to you or help you with anything.\",\n",
       " 'malay': \"Sejak aku pindah sekolah masa umur aku 8 tahun, aku memang tak suka sangat nak datang sekolah. Aku rasa kau orang boleh imagine pembelajaran sekolah masa darjah 2 tu macam mana. Sikit-sikit kena lukis gambar family dalam kelas la, buat kad untuk hari ibu la, buat kad untuk hari bapa and so on. Dari aku darjah 2 sampailah aku sekolah menengah, aku hantar kad dekat cikgu je. Bila hari ibu aku bagi kad kat cikgu perempuan, bila hari bapa pulak aku bagi kad kat cikgu lelaki. Macam tu lah sampai sekolah menengah bila ada subjek Sivik & Kewarganegaraan, pun aku bagi kad dekat cikgu jugak haha Pernah satu malam tu, aku adik beradik pergi rumah atuk aku. Rumah atuk aku ada pakcik aku dan sepupu aku. Pakcik aku ni jenis yang kalau marah dia suka membebel panjang-panjang. Lepas tu dia kerjanya menyapu sampah je. Asal aku pergi rumah atuk aku mesti dia tengah meyapu. Ok sambung cerita balik. Entah kenapa satu malam tu dia marah anak dia tiba-tiba dia cakap 'kau jangan jadi macam anak tak ada mak bapak, terbiar tak ada masa depan,' masa tu terasa la jugak walaupun aku kecik lagi. Tapi nasib baik la adik beradik aku ni bukan kaki gaduh ke apa. Kitorang relax je dengan niat pergi nak tengok atuk, bukan tengok dia. Tapi aku ingat lagi la sampai sekarang benda-benda macam tu. Biasalah, bila kau susah dan berada di titik terendah, tak ada sesiapa nak mengaku sedara, nak tolong ini itu.\"}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(333758, 475485)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(filtered), len(iium_confession)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reject(k):\n",
    "    if 'saya tidak dapat menterjemah teks' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak boleh menterjemah kandungan yang tidak sesuai' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak boleh menterjemah teks' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak mempunyai makna' in k.lower():\n",
    "        return\n",
    "    if 'teks yang disediakan tidak boleh diterjemahkan' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak masuk akal' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan nampaknya tidak' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak mempunyai maksud' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan bukan dalam mana-mana bahasa' in k.lower():\n",
    "        return\n",
    "    if 'teks ini tidak boleh diterjemahkan' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak dapat menterjemah frasa' in k.lower():\n",
    "        return\n",
    "    if 'teks yang disediakan bukan' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak jelas' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan tidak' in k.lower():\n",
    "        return\n",
    "    if 'terjemahan teks kepada' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak boleh menterjemah bahasa' in k.lower():\n",
    "        return\n",
    "    if 'model bahasa AI' in k:\n",
    "        return\n",
    "    if 'bahasa melayu standard' in k.lower():\n",
    "        return\n",
    "    if 'return JSON structure' in k:\n",
    "        return\n",
    "    if 'teks yang diberikan bukan dalam bahasa' in k.lower():\n",
    "        return\n",
    "    if 'teks yang anda berikan bukan' in k.lower():\n",
    "        return\n",
    "    if 'teks yang disediakan tidak' in k.lower():\n",
    "        return\n",
    "    if 'teks yang diberikan bukan' in k.lower():\n",
    "        return\n",
    "    if 'saya tidak dapat menterjemah' in k.lower():\n",
    "        return\n",
    "    if 'tetapi teks yang diberikan' in k.lower():\n",
    "        return\n",
    "    \n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "333758 processed-iium-confession.jsonl\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l processed-iium-confession.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('processed-iium-confession.jsonl', 'w') as fopen:\n",
    "    for d in filtered:\n",
    "        try:\n",
    "#             if not reject(d['ms']):\n",
    "#                 print(d)\n",
    "#                 continue\n",
    "                # break\n",
    "            fopen.write(f'{json.dumps(d)}\\n')\n",
    "        except:\n",
    "            pass"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
